In [18]:
from time import sleep
from urllib.parse import quote_plus

import bs4
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
In [241]:
def text_preprocess(text):
    """Turn free text into a value that is safe to splice into a URL query.

    The text is split on whitespace (so leading, trailing and repeated
    whitespace is dropped), every word is percent-encoded, and the words are
    joined with "+", the query-string encoding of a space.

    Args:
        text: Search phrase such as ``"Data Scientist"``.

    Returns:
        The encoded phrase, e.g. ``"Data+Scientist"``; an empty string for
        empty or whitespace-only input.
    """
    # Encode word by word: a raw "&", "#" or "+" inside a word would otherwise
    # change the meaning of the query string the result is concatenated into
    # ("R&D" would start a new parameter, "C++" would be decoded as spaces).
    return "+".join(quote_plus(word) for word in text.split())
In [192]:
def extract_job_title(soup):
    """Collect the ``title`` attribute of every job-title link on a page.

    Only ``<a data-tn-element="jobTitle">`` elements found inside a
    ``<div class="row">`` are considered.

    Args:
        soup: Parsed document (or any node exposing ``find_all``).

    Returns:
        List of title strings in the order the links are encountered.

    Raises:
        KeyError: If a matching link has no ``title`` attribute.
    """
    row_filter = {"class": "row"}
    link_filter = {"data-tn-element": "jobTitle"}
    return [
        link["title"]
        for row in soup.find_all(name="div", attrs=row_filter)
        for link in row.find_all(name="a", attrs=link_filter)
    ]
In [194]:
def extract_company(soup):
    """Collect company names from every ``<div class="row">`` on a page.

    For each row the text of its ``<span class="company">`` elements is
    taken; only when a row has none of those are its
    ``<span class="result-link-source">`` elements used instead.

    Args:
        soup: Parsed document (or any node exposing ``find_all``).

    Returns:
        List of whitespace-stripped strings in document order. A row that
        matches neither kind of span contributes nothing.
    """
    companies = []
    for row in soup.find_all(name="div", attrs={"class": "row"}):
        spans = row.find_all(name="span", attrs={"class": "company"})
        if not spans:
            # Fall back to the alternative markup for this row.
            spans = row.find_all(name="span", attrs={"class": "result-link-source"})
        companies.extend(span.text.strip() for span in spans)
    return companies
In [195]:
def extract_location(soup):
    """Collect the text of every ``<span class="location">`` on a page.

    Args:
        soup: Parsed document (or any node exposing ``find_all``).

    Returns:
        List of location strings in document order, with surrounding
        whitespace removed so they match the other extractors' output.
    """
    # find_all is the supported spelling; findAll is a deprecated alias.
    return [
        span.text.strip()
        for span in soup.find_all("span", attrs={"class": "location"})
    ]
In [211]:
def extract_summary(soup):
    """Collect the text of every ``<span class="summary">`` on a page.

    Args:
        soup: Parsed document (or any node exposing ``find_all``).

    Returns:
        List of whitespace-stripped summary strings in document order.
    """
    # find_all is the supported spelling; findAll is a deprecated alias.
    return [
        span.text.strip()
        for span in soup.find_all("span", attrs={"class": "summary"})
    ]
In [261]:
def get_indeed_jobs(city='Seattle', job_title='Data Scientist', n_jobs=100):
    """Scrape Indeed search results into a DataFrame.

    One results page is requested for each offset in ``range(0, n_jobs, 50)``,
    asking for up to 50 postings per page within a 25-unit radius of ``city``.
    Nothing here trims the collected rows to ``n_jobs``.

    Args:
        city: Location to search around.
        job_title: Search phrase for the posting.
        n_jobs: Upper bound on the paging offsets; values <= 0 issue no
            request and yield an empty frame.

    Returns:
        pandas.DataFrame with the columns 'Job Title', 'Company', 'Location'
        and 'Summary', filled from the corresponding ``extract_*`` helpers.

    Raises:
        requests.RequestException: If a request times out, fails to connect,
            or the server answers with an HTTP error status.
        ValueError: If the four extracted lists end up with different lengths
            (each helper scans the page independently, so a posting that
            lacks one field shifts that column).
    """
    page_size = 50        # drives both the "limit" parameter and the paging step
    request_timeout = 30  # seconds; requests waits forever without one

    where = text_preprocess(city)
    query = text_preprocess(job_title)

    titles, companies, locations, summaries = [], [], [], []
    for start in range(0, n_jobs, page_size):
        url = (
            "https://www.indeed.com/jobs?q=" + query
            + "&l=" + where
            + "&limit=" + str(page_size)
            + "&radius=25&start=" + str(start)
        )
        page = requests.get(url, timeout=request_timeout)
        # Fail loudly on 4xx/5xx instead of parsing an error page into an
        # empty (and misleading) result.
        page.raise_for_status()
        soup = BeautifulSoup(page.text, "html.parser")

        companies.extend(extract_company(soup))
        titles.extend(extract_job_title(soup))
        locations.extend(extract_location(soup))
        summaries.extend(extract_summary(soup))

        sleep(1)  # pause between requests to avoid hammering the site

    return pd.DataFrame({
        'Job Title': titles,
        'Company': companies,
        'Location': locations,
        'Summary': summaries,
    })
In [262]:
# Preview: scrape 'Data Scientist' postings for Seattle and show the first rows.
get_indeed_jobs(job_title='Data Scientist', city='Seattle', n_jobs=50).head()
Out[262]:
In [263]:
# Same query with a multi-word title of a different field, to check the search phrase is honoured.
get_indeed_jobs(job_title='Chemical Engineer', city='Seattle', n_jobs=50).head()
Out[263]: